Load required libraries.
library(ggplot2)
library(grid)
library(magrittr)
library(dplyr)
library(tidyr)
library(quadprog)
source('data_loading.R')
Load in Wordbank common data.
# Connect to the production Wordbank database and pull the shared tables.
# NOTE(review): all helpers here come from data_loading.R; their semantics
# are inferred from their names -- confirm against that file.
wordbank <- connect.to.wordbank("prod")
common.tables <- get.common.tables(wordbank)
# Item metadata across instruments, restricted to actual words
# (drops non-word item types such as gestures).
items <- get.item.data(common.tables$wordmapping,
common.tables$instrumentsmap,
common.tables$category) %>%
filter(type == "word")
# One entry per (language, form) instrument, including its data table.
instrument.tables <- get.instrument.tables(wordbank, common.tables$instruments)
Get vocabulary composition data for all languages.
# Vocabulary composition for one (language, form) instrument: for each
# child and lexical category, the proportion of that category's words the
# child produces/understands, plus the child's overall vocabulary size
# (as a proportion of all words on the instrument).
get.vocab.comp <- function(input_language, input_form) {
# Items for this instrument in the three categories of interest; numeric
# item ids are recovered by dropping the first 5 characters of the column
# name (assumes ids look like "item_123" -- TODO confirm).
lang.vocab.items <- filter(items, language == input_language, form == input_form) %>%
filter(lexical_category %in% c("nouns", "predicates", "function_words")) %>%
rename(column = item.id) %>%
mutate(item.id = as.numeric(substr(column, 6, nchar(column))))
lang.instrument.table <- filter(instrument.tables,
language == input_language,
form == input_form)$table[[1]]
# Raw per-child responses; NA values count as neither produced nor
# understood, and "produces" implies "understands".
lang.vocab.data <- get.instrument.data(lang.instrument.table,
lang.vocab.items$column) %>%
left_join(select(lang.vocab.items, item.id, lexical_category, item, definition)) %>%
mutate(value = ifelse(is.na(value), "", value),
produces = value == "produces",
understands = value == "produces" | value == "understands")
num.words <- nrow(lang.vocab.items)
# Per-child, per-category counts and within-category proportions.
lang.vocab.summary <- lang.vocab.data %>%
group_by(data_id, lexical_category) %>%
summarise(production.num = sum(produces),
production.prop = sum(produces) / length(produces),
comprehension.num = sum(understands),
comprehension.prop = sum(understands) / length(understands))
# summarise() above peeled off the last grouping level, leaving the data
# grouped by data_id, so this collapses over lexical_category to give
# each child's total vocabulary size.
lang.vocab.sizes <- lang.vocab.summary %>%
summarise(production.vocab = sum(production.num) / num.words,
comprehension.vocab = sum(comprehension.num) / num.words)
# Join the sizes back onto the per-category rows and tag the instrument.
lang.vocab.summary %>%
left_join(lang.vocab.sizes) %>%
select(-production.num, -comprehension.num) %>%
mutate(language = input_language,
form = input_form)
}
# Vocabulary composition for every language that has the given form
# ("WG" or "WS"), stacked into one long data frame with one row per
# (child, lexical category, measure), holding the within-category
# proportion ("prop") and the child's overall vocabulary size ("vocab").
form.vocab.comp <- function(input_form) {
form.languages <- unique(filter(instrument.tables, form == input_form)$language)
# lapply rather than sapply(simplify = FALSE): sapply's return type is
# input-dependent, and a plain list is exactly what bind_rows wants.
bind_rows(lapply(form.languages,
function(lang) get.vocab.comp(lang, input_form))) %>%
mutate(lexical_category = factor(lexical_category,
levels = c("nouns", "predicates", "function_words"),
labels = c("Nouns", "Predicates", "Function Words"))) %>%
gather(measure.var, value,
production.prop, production.vocab,
comprehension.prop, comprehension.vocab) %>%
# NB: this is tidyr::extract -- tidyr is attached after magrittr, so it
# masks magrittr's extract alias for `[`.
extract(measure.var, c("measure", "var"), "([[:alnum:]]+)\\.([[:alnum:]]+)") %>%
spread(var, value)
}
# Build the combined composition data. WS only measures production, so
# its comprehension rows are dropped; WG keeps both measures.
ws.vocab.comp <- filter(form.vocab.comp("WS"), measure == "production")
wg.vocab.comp <- form.vocab.comp("WG")
vocab.comp <- bind_rows(ws.vocab.comp, wg.vocab.comp)
Show sample size of each instrument.
# Sample size of each instrument. Group sizes are identical across
# lexical categories (and measures) within an instrument, so dropping
# those columns and calling distinct() leaves one row per (language, form).
sample_sizes <- vocab.comp %>%
group_by(language, form, measure, lexical_category) %>%
summarise(n = n()) %>%
ungroup() %>%
select(language, form, n) %>%
distinct()
# kable() is never attached in this file (no library(knitr)), so call it
# through its namespace explicitly.
knitr::kable(sample_sizes)
| language | form | n |
|---|---|---|
| BSL | WG | 162 |
| Cantonese | WS | 987 |
| Croatian | WG | 250 |
| Croatian | WS | 377 |
| Danish | WS | 3714 |
| English | WG | 2416 |
| English | WS | 5824 |
| German | WS | 1183 |
| Hebrew | WG | 63 |
| Hebrew | WS | 253 |
| Italian | WS | 752 |
| Mandarin | WS | 1056 |
| Norwegian | WG | 3025 |
| Norwegian | WS | 12969 |
| Russian | WG | 768 |
| Russian | WS | 1037 |
| Spanish | WG | 778 |
| Spanish | WS | 1094 |
| Swedish | WG | 474 |
| Swedish | WS | 900 |
| Turkish | WG | 1115 |
| Turkish | WS | 2422 |
Base plot for looking at vocabulary composition.
# Scatter plot of category proportion against total vocabulary size for
# one form/measure, faceted by language, with a y = x reference line.
# A jittered point layer is built in, so callers get points by default.
base_plot <- function(input_form, input_measure) {
plot.data <- filter(vocab.comp, form == input_form, measure == input_measure)
axis.breaks <- seq(0, 1, 0.2)
ggplot(plot.data, aes(x = vocab, y = prop, colour = lexical_category)) +
facet_wrap(~ language) +
geom_jitter(size = 0.7) +
geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") +
scale_y_continuous(limits = c(0, 1), breaks = axis.breaks,
name = "Proportion of Category\n") +
scale_x_continuous(limits = c(0, 1), breaks = axis.breaks,
name = "\nVocabulary Size") +
scale_color_brewer(palette = "Set1", name = "Lexical Category") +
theme_bw(base_size = 12) +
# NOTE(review): `font` is not defined anywhere in this file --
# presumably set elsewhere before plotting; confirm.
theme(legend.position = c(0.068, 0.95),
legend.text = element_text(size = 9),
legend.title = element_text(size = 9, lineheight = unit(0.8, "char")),
legend.key.height = unit(0.8, "char"),
legend.key.width = unit(0.3, "cm"),
legend.key = element_blank(),
legend.background = element_rect(fill = "transparent"),
text = element_text(family = font))
}
Plot WS productive vocabulary composition as a function of vocabulary size for each language.
# base_plot() already draws a jittered point layer; adding another
# geom_jitter here plotted every point twice, so the extra layer is gone.
base_plot("WS", "production")
Plot WG productive vocabulary composition as a function of vocabulary size for each language.
# base_plot() already draws a jittered point layer; adding another
# geom_jitter here plotted every point twice, so the extra layer is gone.
base_plot("WG", "production")
Plot WG receptive vocabulary composition as a function of vocabulary size for each language.
# base_plot() already draws a jittered point layer; adding another
# geom_jitter here plotted every point twice, so the extra layer is gone.
base_plot("WG", "comprehension")
Make an S3 class for a clm (constrained linear model), which uses quadratic programming to run a regression on data with a specified formula, subject to the constraint that the coefficients of the regression sum to 1 (in the future could support arbitrary constraints on the coefficients).
# Constrained linear model: least-squares fit of `formula` on `data`,
# subject to the equality constraint that the coefficients sum to 1,
# solved as a quadratic program. Returns an S3 object of class "clm"
# whose $solution holds the coefficients and $formula the model formula.
clm <- function(formula, data, ...) {
M <- model.frame(formula, data)
y <- M[, 1]
X <- as.matrix(M[, -1])
# min 1/2 b'(X'X)b - (y'X)b  s.t.  sum(b) = 1 (meq = 1 makes it equality).
# nrow/ncol are spelled out: the original relied on partial argument
# matching (nr=/nc=), which lintr rightly flags.
s <- solve.QP(t(X) %*% X, t(y) %*% X,
matrix(1, nrow = ncol(X), ncol = 1), 1, meq = 1)
class(s) <- "clm"
s$formula <- formula
return(s)
}
# S3 predict method for clm
# S3 predict method for clm: evaluate the fitted polynomial on newdata.
predict.clm <- function(object, newdata) {
# Drop the response (element 2 of the formula call) to get a one-sided
# formula, then build the design matrix from newdata.
rhs.formula <- object$formula[-2]
design <- as.matrix(model.frame(rhs.formula, newdata))
fitted <- design %*% object$solution
rownames(fitted) <- NULL
# Return a plain numeric vector, not a one-column matrix.
fitted[, 1]
}
# S3 predictdf method for clm (called by stat_smooth)
# S3 predictdf method for clm (called by stat_smooth). `se` and `level`
# are accepted to satisfy the stat_smooth interface but are unused: the
# constrained fit provides no standard errors.
predictdf.clm <- function(model, xseq, se, level) {
fitted <- predict(model, newdata = data.frame(x = xseq))
data.frame(x = xseq, y = as.vector(fitted))
}
Plot WS productive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.
# geom_smooth dispatches method = "clm" to the clm()/predictdf.clm
# machinery defined above; "- 1" drops the intercept so only the three
# polynomial coefficients (which must sum to 1) are fit.
base_plot("WS", "production") +
geom_smooth(method = "clm", formula = y ~ I(x^3) + I(x^2) + x - 1)
Plot WG productive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.
# Cubic constrained fit (no intercept), as for the WS production plot.
base_plot("WG", "production") +
geom_smooth(method = "clm", formula = y ~ I(x^3) + I(x^2) + x - 1)
Plot WG receptive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.
# Cubic constrained fit (no intercept), as for the production plots.
base_plot("WG", "comprehension") +
geom_smooth(method = "clm", formula = y ~ I(x^3) + I(x^2) + x - 1)
Function for resampling data.
# Bootstrap the "area" statistic: resample children with replacement
# num.times times; for each resample and lexical category, fit the cubic
# constrained model and compute the signed area between the fitted curve
# and the y = x diagonal. Returns a long data frame with one row per
# (group, bootstrap sample).
sample.areas <- function(d, num.times=1000) {
# Signed area on [0, 1]: the integral of b3*x^3 + b2*x^2 + b1*x - x
# is b3/4 + b2/3 + b1/2 - 1/2, with (b3, b2, b1) = model$solution.
poly.area <- function(group.data) {
model = clm(prop ~ I(vocab^3) + I(vocab^2) + vocab - 1,
data = group.data)
return((model$solution %*% c(1/4,1/3,1/2) - 0.5)[1])
}
counter = 1
# One bootstrap pass; the area column is named after the current counter
# value so successive passes can later be joined side by side.
sample.area <- function(d) {
d.frame <- d %>%
group_by(language, form, measure) %>%
sample_frac(replace = TRUE) %>% # resample kids
group_by(language, form, measure, lexical_category) %>%
do(area = poly.area(.)) %>%
mutate(area = area[1]) %>%
# NOTE(review): do() and rename_() are deprecated dplyr verbs; this
# works on the dplyr version it was written for but is worth
# modernizing (reframe()/rename with :=).
rename_(.dots = setNames("area", counter))
counter <<- counter + 1 # increment counter outside scope
return(d.frame)
}
areas <- replicate(num.times, sample.area(d), simplify=FALSE)
# Join the per-pass columns on the grouping keys, then pivot to long
# form ("sample" is the pass number, "area" its statistic).
Reduce(left_join, areas) %>%
gather(sample, area, -language, -form, -measure, -lexical_category)
}
Resample data, compute area for each sample, find the mean and CI of the area estimate.
# Run the bootstrap (100 resamples), then summarise each group's area
# distribution into a mean and a 95% percentile confidence interval.
areas <- sample.areas(vocab.comp, 100)
area.summary <- areas %>%
group_by(language, form, measure, lexical_category) %>%
summarise(mean = mean(area),
ci.high = quantile(area, 0.975),
ci.low = quantile(area, 0.025)) %>%
ungroup() %>%
mutate(language = factor(language))
# Disabled alternative: order the language factor by mean noun area.
#nouns <- filter(area.summary, lexical_category == "Nouns")
#noun.levels <- nouns$language[order(nouns$mean, nouns$language, decreasing = FALSE)]
#area.summary %<>% ungroup() %>% mutate(language = factor(language, levels = noun.levels))
Plot each lexical category’s area estimate by language, form, and measure.
# Dot-and-interval plot of the area estimates: one point (mean) and one
# segment (95% CI) per language, faceted by lexical category and by
# form/measure, with a dashed line at zero (no over/under-representation).
ggplot(area.summary, aes(y = language, x = mean, col = lexical_category)) +
facet_grid(lexical_category ~ form + measure) +
geom_point() +
geom_segment(aes(x = ci.low, xend = ci.high,
y = language, yend = language)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
# guide = "none" replaces the deprecated guide = FALSE spelling.
scale_colour_brewer(palette = "Set1", name = "", guide = "none") +
# Reverse the level order so languages read alphabetically top-to-bottom.
scale_y_discrete(limits = rev(levels(area.summary$language))) +
theme_bw() +
theme(text = element_text(family = font)) +
ylab("") +
xlab("\nRelative representation in early vocabulary")